{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "header"
      },
      "source": [
        "# Romeo and Juliet Text Extraction with LangExtract\n",
        "\n",
        "This notebook demonstrates extracting characters, emotions, and relationships from Shakespeare's Romeo and Juliet using LangExtract.\n",
        "\n",
        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/google/langextract/blob/main/examples/notebooks/romeo_juliet_extraction.ipynb)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "setup_header"
      },
      "source": [
        "## Setup"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "metadata": {
        "id": "install"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Note: you may need to restart the kernel to use updated packages.\n"
          ]
        }
      ],
      "source": [
        "# Install LangExtract\n",
        "%pip install -q langextract"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "metadata": {
        "id": "api_key"
      },
      "outputs": [],
      "source": [
        "# Set up your Gemini API key\n",
        "# Get your key from: https://aistudio.google.com/app/apikey\n",
        "import os\n",
        "from getpass import getpass\n",
        "\n",
        "# Prompt only when no usable key is configured. An unset variable and an\n",
        "# empty or whitespace-only one are treated alike, because a blank value would\n",
        "# pass a plain membership check without being a usable key.\n",
        "if not os.environ.get('GEMINI_API_KEY', '').strip():\n",
        "    # getpass hides the key while typing so it never lands in the saved output;\n",
        "    # strip() drops stray whitespace or a newline picked up when pasting.\n",
        "    os.environ['GEMINI_API_KEY'] = getpass('Enter your Gemini API key: ').strip()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "define_header"
      },
      "source": [
        "## Define Extraction Task"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "id": "setup_extraction"
      },
      "outputs": [],
      "source": [
        "import langextract as lx\n",
        "import textwrap\n",
        "\n",
        "# Task description handed to the model. dedent() strips the shared leading\n",
        "# indentation, leaving three plain lines with no trailing newline.\n",
        "prompt = textwrap.dedent(\n",
        "    \"\"\"\\\n",
        "    Extract characters, emotions, and relationships in order of appearance.\n",
        "    Use exact text for extractions. Do not paraphrase or overlap entities.\n",
        "    Provide meaningful attributes for each entity to add context.\"\"\"\n",
        ")\n",
        "\n",
        "# A single worked example: one passage plus the spans annotated in it, listed\n",
        "# as (extraction class, exact text, attributes) in order of appearance.\n",
        "example_passage = \"ROMEO. But soft! What light through yonder window breaks? It is the east, and Juliet is the sun.\"\n",
        "example_spans = [\n",
        "    (\"character\", \"ROMEO\", {\"emotional_state\": \"wonder\"}),\n",
        "    (\"emotion\", \"But soft!\", {\"feeling\": \"gentle awe\"}),\n",
        "    (\"relationship\", \"Juliet is the sun\", {\"type\": \"metaphor\"}),\n",
        "]\n",
        "\n",
        "examples = [\n",
        "    lx.data.ExampleData(\n",
        "        text=example_passage,\n",
        "        extractions=[\n",
        "            lx.data.Extraction(\n",
        "                extraction_class=span_class,\n",
        "                extraction_text=span_text,\n",
        "                attributes=span_attributes,\n",
        "            )\n",
        "            for span_class, span_text, span_attributes in example_spans\n",
        "        ],\n",
        "    )\n",
        "]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "extract_header"
      },
      "source": [
        "## Extract from Sample Text"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "metadata": {
        "id": "simple_extraction"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "\u001b[94m\u001b[1mLangExtract\u001b[0m: model=\u001b[92mgemini-2.5-flash\u001b[0m, current=\u001b[92m68\u001b[0m chars, processed=\u001b[92m68\u001b[0m chars:  [00:01]"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\u001b[92m✓\u001b[0m Extraction processing complete\n",
            "\u001b[92m✓\u001b[0m Extracted \u001b[1m3\u001b[0m entities (\u001b[1m3\u001b[0m unique types)\n",
            "  \u001b[96m•\u001b[0m Time: \u001b[1m1.96s\u001b[0m\n",
            "  \u001b[96m•\u001b[0m Speed: \u001b[1m35\u001b[0m chars/sec\n",
            "  \u001b[96m•\u001b[0m Chunks: \u001b[1m1\u001b[0m\n",
            "Extracted 3 entities:\n",
            "\n",
            "• character: 'Lady Juliet'\n",
            "  - emotional_state: longing\n",
            "• emotion: 'gazed longingly at the stars, her heart aching'\n",
            "  - feeling: melancholy longing\n",
            "• relationship: 'her heart aching for Romeo'\n",
            "  - type: romantic\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "\n"
          ]
        }
      ],
      "source": [
        "# Simple extraction from a short text\n",
        "input_text = \"Lady Juliet gazed longingly at the stars, her heart aching for Romeo\"\n",
        "\n",
        "result = lx.extract(\n",
        "    text_or_documents=input_text,\n",
        "    prompt_description=prompt,\n",
        "    examples=examples,\n",
        "    model_id=\"gemini-2.5-flash\",\n",
        ")\n",
        "\n",
        "# Display results\n",
        "print(f\"Extracted {len(result.extractions)} entities:\\n\")\n",
        "for extraction in result.extractions:\n",
        "    print(f\"• {extraction.extraction_class}: '{extraction.extraction_text}'\")\n",
        "    if extraction.attributes:\n",
        "        for key, value in extraction.attributes.items():\n",
        "            print(f\"  - {key}: {value}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "viz_header"
      },
      "source": [
        "## Interactive Visualization"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {
        "id": "visualization"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "\u001b[94m\u001b[1mLangExtract\u001b[0m: Saving to \u001b[92mromeo_juliet.jsonl\u001b[0m: 1 docs [00:00, 995.33 docs/s]"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\u001b[92m✓\u001b[0m Saved \u001b[1m1\u001b[0m documents to \u001b[92mromeo_juliet.jsonl\u001b[0m\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "\n",
            "\u001b[94m\u001b[1mLangExtract\u001b[0m: Loading \u001b[92mromeo_juliet.jsonl\u001b[0m: 100%|██████████| 961/961 [00:00<00:00, 2.49MB/s]"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\u001b[92m✓\u001b[0m Loaded \u001b[1m1\u001b[0m documents from \u001b[92mromeo_juliet.jsonl\u001b[0m\n",
            "Interactive visualization (hover over highlights to see attributes):\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "\n"
          ]
        },
        {
          "data": {
            "text/html": [
              "<style>\n",
              ".lx-highlight { position: relative; border-radius:3px; padding:1px 2px;}\n",
              ".lx-highlight .lx-tooltip {\n",
              "  visibility: hidden;\n",
              "  opacity: 0;\n",
              "  transition: opacity 0.2s ease-in-out;\n",
              "  background: #333;\n",
              "  color: #fff;\n",
              "  text-align: left;\n",
              "  border-radius: 4px;\n",
              "  padding: 6px 8px;\n",
              "  position: absolute;\n",
              "  z-index: 1000;\n",
              "  bottom: 125%;\n",
              "  left: 50%;\n",
              "  transform: translateX(-50%);\n",
              "  font-size: 12px;\n",
              "  max-width: 240px;\n",
              "  white-space: normal;\n",
              "  box-shadow: 0 2px 6px rgba(0,0,0,0.3);\n",
              "}\n",
              ".lx-highlight:hover .lx-tooltip { visibility: visible; opacity:1; }\n",
              ".lx-animated-wrapper { max-width: 100%; font-family: Arial, sans-serif; }\n",
              ".lx-controls {\n",
              "  background: #fafafa; border: 1px solid #90caf9; border-radius: 8px;\n",
              "  padding: 12px; margin-bottom: 16px;\n",
              "}\n",
              ".lx-button-row {\n",
              "  display: flex; justify-content: center; gap: 8px; margin-bottom: 12px;\n",
              "}\n",
              ".lx-control-btn {\n",
              "  background: #4285f4; color: white; border: none; border-radius: 4px;\n",
              "  padding: 8px 16px; cursor: pointer; font-size: 13px; font-weight: 500;\n",
              "  transition: background-color 0.2s;\n",
              "}\n",
              ".lx-control-btn:hover { background: #3367d6; }\n",
              ".lx-progress-container {\n",
              "  margin-bottom: 8px;\n",
              "}\n",
              ".lx-progress-slider {\n",
              "  width: 100%; margin: 0; appearance: none; height: 6px;\n",
              "  background: #ddd; border-radius: 3px; outline: none;\n",
              "}\n",
              ".lx-progress-slider::-webkit-slider-thumb {\n",
              "  appearance: none; width: 18px; height: 18px; background: #4285f4;\n",
              "  border-radius: 50%; cursor: pointer;\n",
              "}\n",
              ".lx-progress-slider::-moz-range-thumb {\n",
              "  width: 18px; height: 18px; background: #4285f4; border-radius: 50%;\n",
              "  cursor: pointer; border: none;\n",
              "}\n",
              ".lx-status-text {\n",
              "  text-align: center; font-size: 12px; color: #666; margin-top: 4px;\n",
              "}\n",
              ".lx-text-window {\n",
              "  font-family: monospace; white-space: pre-wrap; border: 1px solid #90caf9;\n",
              "  padding: 12px; max-height: 260px; overflow-y: auto; margin-bottom: 12px;\n",
              "  line-height: 1.6;\n",
              "}\n",
              ".lx-attributes-panel {\n",
              "  background: #fafafa; border: 1px solid #90caf9; border-radius: 6px;\n",
              "  padding: 8px 10px; margin-top: 8px; font-size: 13px;\n",
              "}\n",
              ".lx-current-highlight {\n",
              "  border-bottom: 4px solid #ff4444;\n",
              "  font-weight: bold;\n",
              "  animation: lx-pulse 1s ease-in-out;\n",
              "}\n",
              "@keyframes lx-pulse {\n",
              "  0% { text-decoration-color: #ff4444; }\n",
              "  50% { text-decoration-color: #ff0000; }\n",
              "  100% { text-decoration-color: #ff4444; }\n",
              "}\n",
              ".lx-legend {\n",
              "  font-size: 12px; margin-bottom: 8px;\n",
              "  padding-bottom: 8px; border-bottom: 1px solid #e0e0e0;\n",
              "}\n",
              ".lx-label {\n",
              "  display: inline-block;\n",
              "  padding: 2px 4px;\n",
              "  border-radius: 3px;\n",
              "  margin-right: 4px;\n",
              "  color: #000;\n",
              "}\n",
              ".lx-attr-key {\n",
              "  font-weight: 600;\n",
              "  color: #1565c0;\n",
              "  letter-spacing: 0.3px;\n",
              "}\n",
              ".lx-attr-value {\n",
              "  font-weight: 400;\n",
              "  opacity: 0.85;\n",
              "  letter-spacing: 0.2px;\n",
              "}\n",
              "\n",
              "/* Add optimizations with larger fonts and better readability for GIFs */\n",
              ".lx-gif-optimized .lx-text-window { font-size: 16px; line-height: 1.8; }\n",
              ".lx-gif-optimized .lx-attributes-panel { font-size: 15px; }\n",
              ".lx-gif-optimized .lx-current-highlight { text-decoration-thickness: 4px; }\n",
              "</style>\n",
              "<div class=\"lx-animated-wrapper lx-gif-optimized\">\n",
              "  <div class=\"lx-attributes-panel\">\n",
              "    <div class=\"lx-legend\">Highlights Legend: <span class=\"lx-label\" style=\"background-color:#D2E3FC;\">character</span> <span class=\"lx-label\" style=\"background-color:#C8E6C9;\">emotion</span> <span class=\"lx-label\" style=\"background-color:#FEF0C3;\">relationship</span></div>\n",
              "    <div id=\"attributesContainer\"></div>\n",
              "  </div>\n",
              "  <div class=\"lx-text-window\" id=\"textWindow\">\n",
              "    <span class=\"lx-highlight lx-current-highlight\" data-idx=\"0\" style=\"background-color:#D2E3FC;\">Lady Juliet</span> <span class=\"lx-highlight\" data-idx=\"1\" style=\"background-color:#C8E6C9;\">gazed longingly at the stars, <span class=\"lx-highlight\" data-idx=\"2\" style=\"background-color:#FEF0C3;\">her heart aching</span> for Romeo</span>\n",
              "  </div>\n",
              "  <div class=\"lx-controls\">\n",
              "    <div class=\"lx-button-row\">\n",
              "      <button class=\"lx-control-btn\" onclick=\"playPause()\">▶️ Play</button>\n",
              "      <button class=\"lx-control-btn\" onclick=\"prevExtraction()\">⏮ Previous</button>\n",
              "      <button class=\"lx-control-btn\" onclick=\"nextExtraction()\">⏭ Next</button>\n",
              "    </div>\n",
              "    <div class=\"lx-progress-container\">\n",
              "      <input type=\"range\" id=\"progressSlider\" class=\"lx-progress-slider\"\n",
              "             min=\"0\" max=\"2\" value=\"0\"\n",
              "             onchange=\"jumpToExtraction(this.value)\">\n",
              "    </div>\n",
              "    <div class=\"lx-status-text\">\n",
              "      Entity <span id=\"entityInfo\">1/3</span> |\n",
              "      Pos <span id=\"posInfo\">[0-11]</span>\n",
              "    </div>\n",
              "  </div>\n",
              "</div>\n",
              "\n",
              "<script>\n",
              "  (function() {\n",
              "    const extractions = [{\"index\": 0, \"class\": \"character\", \"text\": \"Lady Juliet\", \"color\": \"#D2E3FC\", \"startPos\": 0, \"endPos\": 11, \"beforeText\": \"\", \"extractionText\": \"Lady Juliet\", \"afterText\": \" gazed longingly at the stars, her heart aching for Romeo\", \"attributesHtml\": \"<div><strong>class:</strong> character</div><div><strong>attributes:</strong> {<span class=\\\"lx-attr-key\\\">emotional_state</span>: <span class=\\\"lx-attr-value\\\">longing</span>}</div>\"}, {\"index\": 1, \"class\": \"emotion\", \"text\": \"gazed longingly at the stars, her heart aching\", \"color\": \"#C8E6C9\", \"startPos\": 12, \"endPos\": 58, \"beforeText\": \"Lady Juliet \", \"extractionText\": \"gazed longingly at the stars, her heart aching\", \"afterText\": \" for Romeo\", \"attributesHtml\": \"<div><strong>class:</strong> emotion</div><div><strong>attributes:</strong> {<span class=\\\"lx-attr-key\\\">feeling</span>: <span class=\\\"lx-attr-value\\\">melancholy longing</span>}</div>\"}, {\"index\": 2, \"class\": \"relationship\", \"text\": \"her heart aching for Romeo\", \"color\": \"#FEF0C3\", \"startPos\": 42, \"endPos\": 68, \"beforeText\": \"Lady Juliet gazed longingly at the stars, \", \"extractionText\": \"her heart aching for Romeo\", \"afterText\": \"\", \"attributesHtml\": \"<div><strong>class:</strong> relationship</div><div><strong>attributes:</strong> {<span class=\\\"lx-attr-key\\\">type</span>: <span class=\\\"lx-attr-value\\\">romantic</span>}</div>\"}];\n",
              "    let currentIndex = 0;\n",
              "    let isPlaying = false;\n",
              "    let animationInterval = null;\n",
              "    let animationSpeed = 1.0;\n",
              "\n",
              "    function updateDisplay() {\n",
              "      const extraction = extractions[currentIndex];\n",
              "      if (!extraction) return;\n",
              "\n",
              "      document.getElementById('attributesContainer').innerHTML = extraction.attributesHtml;\n",
              "      document.getElementById('entityInfo').textContent = (currentIndex + 1) + '/' + extractions.length;\n",
              "      document.getElementById('posInfo').textContent = '[' + extraction.startPos + '-' + extraction.endPos + ']';\n",
              "      document.getElementById('progressSlider').value = currentIndex;\n",
              "\n",
              "      const playBtn = document.querySelector('.lx-control-btn');\n",
              "      if (playBtn) playBtn.textContent = isPlaying ? '⏸ Pause' : '▶️ Play';\n",
              "\n",
              "      const prevHighlight = document.querySelector('.lx-text-window .lx-current-highlight');\n",
              "      if (prevHighlight) prevHighlight.classList.remove('lx-current-highlight');\n",
              "      const currentSpan = document.querySelector('.lx-text-window span[data-idx=\"' + currentIndex + '\"]');\n",
              "      if (currentSpan) {\n",
              "        currentSpan.classList.add('lx-current-highlight');\n",
              "        currentSpan.scrollIntoView({block: 'center', behavior: 'smooth'});\n",
              "      }\n",
              "    }\n",
              "\n",
              "    function nextExtraction() {\n",
              "      currentIndex = (currentIndex + 1) % extractions.length;\n",
              "      updateDisplay();\n",
              "    }\n",
              "\n",
              "    function prevExtraction() {\n",
              "      currentIndex = (currentIndex - 1 + extractions.length) % extractions.length;\n",
              "      updateDisplay();\n",
              "    }\n",
              "\n",
              "    function jumpToExtraction(index) {\n",
              "      currentIndex = parseInt(index);\n",
              "      updateDisplay();\n",
              "    }\n",
              "\n",
              "    function playPause() {\n",
              "      if (isPlaying) {\n",
              "        clearInterval(animationInterval);\n",
              "        isPlaying = false;\n",
              "      } else {\n",
              "        animationInterval = setInterval(nextExtraction, animationSpeed * 1000);\n",
              "        isPlaying = true;\n",
              "      }\n",
              "      updateDisplay();\n",
              "    }\n",
              "\n",
              "    window.playPause = playPause;\n",
              "    window.nextExtraction = nextExtraction;\n",
              "    window.prevExtraction = prevExtraction;\n",
              "    window.jumpToExtraction = jumpToExtraction;\n",
              "\n",
              "    updateDisplay();\n",
              "  })();\n",
              "</script>"
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "execution_count": 12,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Save results to JSONL\n",
        "lx.io.save_annotated_documents([result], output_name=\"romeo_juliet.jsonl\", output_dir=\".\")\n",
        "\n",
        "# Generate interactive visualization\n",
        "html_content = lx.visualize(\"romeo_juliet.jsonl\")\n",
        "\n",
        "# Display in notebook\n",
        "print(\"Interactive visualization (hover over highlights to see attributes):\")\n",
        "html_content"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 13,
      "metadata": {
        "id": "save_viz"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "✓ Visualization saved to romeo_juliet_visualization.html\n",
            "You can download this file from the Files panel on the left.\n"
          ]
        }
      ],
      "source": [
        "# Save visualization to file (for downloading)\n",
        "\n",
        "# Handle both Jupyter (HTML object, markup in .data) and non-Jupyter (plain\n",
        "# string) environments by normalising to a string before writing.\n",
        "html_text = html_content.data if hasattr(html_content, 'data') else html_content\n",
        "\n",
        "# Write as UTF-8 explicitly. The generated page contains non-ASCII characters\n",
        "# (for example the emoji button labels), and open() otherwise falls back to\n",
        "# the locale's encoding, which raises UnicodeEncodeError on non-UTF-8 systems.\n",
        "with open(\"romeo_juliet_visualization.html\", \"w\", encoding=\"utf-8\") as f:\n",
        "    f.write(html_text)\n",
        "\n",
        "print(\"✓ Visualization saved to romeo_juliet_visualization.html\")\n",
        "print(\"You can download this file from the Files panel on the left.\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "experiment_header"
      },
      "source": [
        "## Try Your Own Text\n",
        "\n",
        "Experiment with your own Shakespeare quotes or any literary text!"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 14,
      "metadata": {
        "id": "experiment"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "\u001b[94m\u001b[1mLangExtract\u001b[0m: model=\u001b[92mgemini-2.5-flash\u001b[0m, current=\u001b[92m163\u001b[0m chars, processed=\u001b[92m163\u001b[0m chars:  [00:05]"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\u001b[92m✓\u001b[0m Extraction processing complete\n",
            "\u001b[92m✓\u001b[0m Extracted \u001b[1m6\u001b[0m entities (\u001b[1m3\u001b[0m unique types)\n",
            "  \u001b[96m•\u001b[0m Time: \u001b[1m5.84s\u001b[0m\n",
            "  \u001b[96m•\u001b[0m Speed: \u001b[1m28\u001b[0m chars/sec\n",
            "  \u001b[96m•\u001b[0m Chunks: \u001b[1m1\u001b[0m\n",
            "Extractions from your text:\n",
            "\n",
            "• character: 'JULIET'\n",
            "  - emotional_state: longing\n",
            "• emotion: 'O Romeo, Romeo! wherefore art thou Romeo?'\n",
            "  - feeling: desperate questioning\n",
            "• relationship: 'thy father'\n",
            "  - type: familial\n",
            "• relationship: 'thy name'\n",
            "  - type: lineage\n",
            "• relationship: 'my love'\n",
            "  - type: romantic bond\n",
            "• relationship: 'Capulet'\n",
            "  - type: family affiliation\n"
          ]
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "\n"
          ]
        }
      ],
      "source": [
        "# Try your own text\n",
        "your_text = \"\"\"\n",
        "JULIET: O Romeo, Romeo! wherefore art thou Romeo?\n",
        "Deny thy father and refuse thy name;\n",
        "Or, if thou wilt not, be but sworn my love,\n",
        "And I'll no longer be a Capulet.\n",
        "\"\"\"\n",
        "\n",
        "custom_result = lx.extract(\n",
        "    text_or_documents=your_text,\n",
        "    prompt_description=prompt,\n",
        "    examples=examples,\n",
        "    model_id=\"gemini-2.5-flash\",\n",
        ")\n",
        "\n",
        "print(\"Extractions from your text:\\n\")\n",
        "for e in custom_result.extractions:\n",
        "    print(f\"• {e.extraction_class}: '{e.extraction_text}'\")\n",
        "    if e.attributes:\n",
        "        for key, value in e.attributes.items():\n",
        "            print(f\"  - {key}: {value}\")"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "name": "Romeo and Juliet Text Extraction with LangExtract",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "venv",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.13.5"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
